*******************************************************************************
*
* Code to construct analysis data from the raw LDAR data
*
*******************************************************************************

* First, make sure that directory is set to the appropriate location
* (where the LDAR_RCT_replication_package is located)


*******************************************************************************
* Initial Set up
*******************************************************************************

clear all
set maxvar 11000


*******************************************************************************
* Building files
*******************************************************************************
	
**********************************
* Component emissions:



	import excel using "Data/Raw/Wang_et_al_data/Wang_expanded_sample.xlsx", ///
		sheet("Component_Emissions") first clear

	// Drops the Large Facility with Gathering System cases
	drop if SiteType == "Large Facility w/ Gathering System"

	// Cleans survey to be numeric
	destring Survey, ignore("Survey ") replace
	
	* Keep only Survey 1 and Survey 5 (baseline and endline)
	destring Survey, ignore("Survey ") replace
	keep if inlist(Survey,1,5)
	
	* Counts number of leaks in component data (and we'll compare that with those in Tag data)
	count if VentLeak == "Leak" & Survey==1 & TreatmentGroup != "Control" // 438
	
	* First aggregates to Survey x Site level x whether Vent vs. Leak
	egen Tot_comp_ = total(1), by(Site TreatmentGroup Survey VentLeak)
	egen Tot_emiss_ = total(Emissionkgd), by(Site TreatmentGroup Survey VentLeak)
	egen Max_emiss_ = max(Emissionkgd), by(Site TreatmentGroup Survey VentLeak)
	keep Site TreatmentGroup Survey VentLeak Tot_* Max_*
	duplicates drop
	reshape wide Tot_* Max_*, i(Site TreatmentGroup Survey) j(VentLeak) string
	* ==> Now we have one observation per Site x Survey
	
	* Then reshapes to wide again
	reshape wide Tot_* Max_* , i(Site TreatmentGroup) j(Survey)
	* ==> Now we have one observation per Site 
	
	/* Fills in zeros ... because we had unbalanced endline vs. baseline count
	   and so now that it is wide (endline and baseline on the same line), 
	   cases with missing for one of those get filled in as zeros     */
	foreach var of varlist Tot_* Max_* {
		replace `var' = 0 if missing(`var')
	}
	
	* Saves
	duplicates report Site // no duplicates
	tempfile component_wide
	save "`component_wide'"

	
******************************
* Site emissions:


	import excel using "Data/Raw/Wang_et_al_data/Wang_expanded_sample.xlsx", ///
		sheet("Site_Emissions") first clear

	tempfile site
	save "`site'"
		
	* Turns Operator into numeric
	encode Operator, gen(Operator_temp)
	drop Operator
	rename Operator_temp Operator 

	
	* Destrings  variables that should be numeric -- new code:
	foreach var of varlist Production* {
		replace `var' = ".n" if `var'=="NA"
		destring `var', replace ignore(",")
	}
	
	* Destrings the survey variable
	destring Survey, ignore("Survey ") replace

	* Keeps only baseline and endline
	keep if inlist(Survey,1,5)

	* Reshapes wide 
	foreach var of varlist Emissionskgd-ProductionOilEnergyGJmonth Operator {
		rename `var' `var'_
	}
	reshape wide Emissionskgd-ProductionOilEnergyGJmonth Operator, i(Site SiteType TreatmentGroup) j(Survey)
	describe

	* Saves:
	tempfile site_wide
	save "`site_wide'", replace


*******************************************************************************
*
*     MERGING -- SITE LEVEL
* 
*******************************************************************************

	* Merge site emissions with repairs
	use "`site_wide'" , clear
	
	* Merge site emissions with vented emissions and flared emissions data from the component database:
	merge 1:1 Site using "`component_wide'"
	* The ones that don't merge in components are places with zero emissions -- so fills those in as zeros
	foreach var of varlist Tot* Max* {
		replace `var' = 0 if _merge==1 // if not in component_wide data, then assume zero leaks
	}
	drop _merge 
	
*******************************************************************************
*
*     NEW VARIABLES
* 
*******************************************************************************


* Treatment group variables 
	gen Treatment_Ind = TreatmentGroup != "Control"
	label var Treatment_Ind "Treatment" 
	
	gen TreatmentGroup_R = 0 if TreatmentGroup == "Control"
	replace TreatmentGroup_R = 1 if TreatmentGroup == "Annual"
	replace TreatmentGroup_R = 2 if TreatmentGroup == "Bi-Annual"
	replace TreatmentGroup_R = 3 if TreatmentGroup == "Tri-Annual"
	label define TreatmentGroup_R 0 "Control" 1 "Annual" 2 "Biannual" 3 "Triannual"
	label values TreatmentGroup_R TreatmentGroup_R

	
	gen Annual = TreatmentGroup == "Annual"
	gen Biannual = TreatmentGroup == "Bi-Annual"
	gen Triannual = TreatmentGroup == "Tri-Annual"
	gen Control = TreatmentGroup == "Control"

	
	label var Annual "Annual"
	label var Biannual "Biannual"
	label var Triannual "Triannual"
	label var Control "Control"
	
	gen Treatment_Intensity = TreatmentGroup_R - 1 if TreatmentGroup_R != 0 // Ranges from 0 (annual) to 2 (triannual)
	replace Treatment_Intensity = 0 if Treatment_Ind == 0
	label var Treatment_Intensity "Treatment Intensity"

* Whether has outlier baseline leakage, venting, or total emissions:
	sum Tot_emiss_Leak1, detail
	gen is_outlier_Leak1 = Tot_emiss_Leak1 >= r(p95)
	sum Tot_emiss_Vent1, detail
	gen is_outlier_Vent1 = Tot_emiss_Vent1 >= r(p95)
	gen is_outlier_Either1 = is_outlier_Leak1 | is_outlier_Vent1

	
* Site type indicators:

	gen GasSite = inlist(SiteType,"Gas MW","Gas SW")
	gen MWSite = inlist(SiteType,"Gas MW", "Oil MW", "Oil MWPro")
	
	label var GasSite "Gas Site"
	label var MWSite "Multiwell Site"
	
* Site start date (in years + (month-1)/12)
	gen Site_Start_YRM = Site_start_year + (Site_start_month-1)/12
	
* Transformations of gas, emissions, leakage, and venting:
	gen Tot_comp_either1 = Tot_comp_Leak1 + Tot_comp_Vent1
	gen Tot_comp_either5 = Tot_comp_Leak5 + Tot_comp_Vent5

	gen change_Emissions = Emissionskgd_5 - Emissionskgd_1
	gen change_Leak = Tot_emiss_Leak5 - Tot_emiss_Leak1 
	gen change_Vent = Tot_emiss_Vent5 - Tot_emiss_Vent1 

	label var change_Emissions "change in emissions"
	label var change_Leak "change in leakage"
	label var change_Vent "change in venting"

	* Shortens the production variable names:
	rename ProductionGas103m3month_1 ProdGas_1
	rename ProductionGas103m3month_5 ProdGas_5
	rename ProductionOilm3month_1    ProdOil_1
	rename ProductionOilm3month_5    ProdOil_5
	
	* Transformations:
	foreach var of varlist Emissionskgd_? ProdGas_? ///
			ProdOil_? Tot_emiss_Leak? Tot_emiss_Vent? ///
			Tot_comp_Leak? Tot_comp_Vent? {
		* whether positive
		gen pos_`var' = `var'>0 if ~missing(`var')

		* True log (logN):
		gen logN_`var' = log(`var')
		
		* "log" of -- where is defined as zero if variable is zero
		gen log_`var' = log((`var'==0) + (`var'>0)*`var') if ~missing(`var')
		
		* log + 1
		gen log1_`var' = log(1 + `var')
		
		* inverse hyperbolic sine
		gen isnh_`var' = log(`var' + (`var'^2 + 1)^0.5) if ~missing(`var')
		
	}
	* Interaction with treatment:
	foreach var of varlist Emissionskgd_1 ProdGas_1 ProdOil_1 ///
		Tot_emiss_Leak1 Tot_emiss_Vent1 Tot_comp_Leak1 Tot_comp_Vent1 {
			
		* Interaction with treatment
		gen Treat_`var' = Treatment_Ind * `var'
		
		* Interaction with whether positive:
		gen Treat_pos_`var' = Treatment_Ind * pos_`var'

		* Interaction with true log:
		gen Treat_logN_`var' = Treatment_Ind * log_`var'
		
		* Interaction with "log" of variable:
		gen Treat_log_`var' = Treatment_Ind * log_`var'
		
		* Interaction with log+1 variable:
		gen Treat_log1_`var' = Treatment_Ind * log1_`var'
		
		* inverse hyperbolic sine
		gen Treat_isnh_`var' = Treatment_Ind * isnh_`var'
		
	}
	
	
	
	* Interaction of baseline leakage with treatment intensity:
	gen TI_log1_Leak1 = Treatment_Intensity * log1_Tot_emiss_Leak1
	

	* Other interaction variables:
	gen Treat_GasSite = Treatment_Ind * GasSite
	
	* More labels: 
	label var Tot_emiss_Leak1 "Baseline leakage"
	label var Tot_emiss_Leak5 "Endline leakage"
	label var Tot_emiss_Vent1 "Baseline venting"
	label var Tot_emiss_Vent5 "Endline venting"
	label var Emissionskgd_1 "Baseline emissions"
	label var Tot_comp_Leak1 "Baseline \# leaks"
	label var Tot_comp_Leak5 "Endline \# leaks"
	label var Tot_comp_Vent1 "Baseline \# vents"
	label var Tot_comp_Vent5 "Endline \# vents"
	label var isnh_Tot_emiss_Leak1 "sinh\textsuperscript{-1}(base leakage)"
	label var isnh_Emissionskgd_1 "sinh\textsuperscript{-1}(baseline emissions)"
	label var isnh_Tot_comp_Leak1 "sinh\textsuperscript{-1}(baseline \# leaks)"
	label var isnh_Tot_comp_Leak5 "sinh\textsuperscript{-1}(endline \# leaks)"
	label var isnh_Tot_comp_Vent1 "sinh\textsuperscript{-1}(baseline \# vents)"
	label var isnh_Tot_comp_Vent5 "sinh\textsuperscript{-1}(endline \# vents)"
	label var log1_Tot_emiss_Leak1 "log(1 + baseline leakage)"
	label var log1_Tot_emiss_Leak5 "log(1 + endline leakage)"
	label var log1_Tot_emiss_Vent1 "log(1 + baseline venting)"
	label var log1_Tot_emiss_Vent5 "log(1 + endline venting)"
	label var log1_Emissionskgd_1 "log(1 + baseline emissions)"

	label var log1_ProdGas_1 "log(1 + baseline gas production)"
	label var log1_ProdOil_1 "log(1 + baseline oil production)"
	
	label var Treat_log1_Tot_emiss_Leak1 "Treatment * log(1+\base. leakage)"
	label var Treat_log1_Tot_emiss_Vent1 "Treatment * log(1+base. venting)"
	
	label var TI_log1_Leak1 "Treat. Intens. * log(1+base leakage) "
	
	
	
*******************************************************************************
*
*     Creates firm level variables for control sites giving info on fraction of sites
*       treated more intensively, whether treated sites tend to have higher emissions
* 
*******************************************************************************

	* Fraction of sites that are treated more intensively
	gen is_bitri = inlist(TreatmentGroup, "Bi-Annual", "Tri-Annual") if TreatmentGroup != "Control"
	gen is_tri = TreatmentGroup == "Tri-Annual" if TreatmentGroup != "Control"
	
	egen mean_bitri_temp = mean(is_bitri) if TreatmentGroup != "Control", by(Operator_1)
	egen mean_bitri = mean(mean_bitri_temp), by(Operator_1)
	drop mean_bitri_temp

	egen mean_tri_temp = mean(is_tri) if TreatmentGroup != "Control", by(Operator_1)
	egen mean_tri = mean(mean_tri_temp), by(Operator_1)
	drop mean_tri_temp
	drop is_tri is_bitri
		
	* Qreg emissions to see whether actual emissions at treatment sites for a given
	* firm are higher or lower than average
	encode TreatmentGroup, gen(TreatmentGroup_encoded)
	encode SiteType, gen(SiteType_R)
	bsqreg Tot_emiss_Leak1 i.TreatmentGroup_R i.SiteType_R if TreatmentGroup != "Control"
	predict xb if TreatmentGroup != "Control"
	gen higher_than_average = Tot_emiss_Leak1 >= xb if TreatmentGroup != "Control"
	drop xb
	
	egen mean_has_leaks_above_med_temp = mean(higher_than_average) if TreatmentGroup != "Control", by(Operator_1)
	egen mean_has_leaks_above_med = mean(mean_has_leaks_above_med_temp), by(Operator_1)
	drop mean_has_leaks_above_med_temp
	
	* Keeps these variables only for control sites
	foreach var of varlist mean_bitri mean_tri mean_has_leaks_above_med {
		replace `var' = . if TreatmentGroup != "Control"
	}
	

*******************************************************************************
*
*     Saves
* 
*******************************************************************************

save "Data/Intermediate/site_level_for_analysis.dta", replace

